# This file contains R script that reproduces the results reported in "Roma and
# Bureaucrats: A Field Experiment in the Czech Republic."

# Authors:

# Štepán Mikula | stepan.mikula@econ.muni.cz
# Department of Economics, Masaryk University

# Josef Montag | josef.montag@gmail.com  
# Department of Economics, Faculty of Law, Charles University

# September 17, 2021 

# The main database is the anonymized dataset "Replication_dataset.csv," which
# is an output of the script "Prepare_replication_data__Do_not_run.R."

# The script also reproduces results, which we discus in the paper but do not report.

# You need to have installed the libraries required by the code.

# For better orientation, the file is split into sections indicated with two or
# three hashes (#) and Vim folding signs (e.g. {{{1).

# The script was tested to work in R version 4.1.1 (2021-08-10), R Core Team.
# 2021. "R: A language and environment for statistical computing." R Foundation
# for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/.

### Preliminaries {{{1

rm(list=ls())

# Load libraries
library(tidyverse)
library(plm)
library(stargazer)
library(car)
library(exact2x2)
library(xtable)

# Useful functions

# Produce and print regression table (incl. clustered standard errors)
produce.and.print.table <- 
  function() {
    stargazer(models,
              se = lapply(models, function(m) {
                            sqrt(diag(vcovHC(m,type="HC3",cluster="group")))
                   }),
              type = 'text',
              star.cutoffs = c(0.05, 0.01, 0.001),
              keep.stat = c('adj.rsq', 'n'),
              dep.var.caption = '',
              dep.var.labels.include = F,
              column.labels=(c('All', 'All', 'Czechs', 'Roma', 'High',
                               'Low', 'HR LC'))
              )
  }

### Load the data {{{1

rdata <- read_csv('Data/Replication_dataset.csv')

### Response rates, by treatment arm: Figure 1 {{{1

# Compute counts of responses and non-responses by treatment arms
counts <- rdata %>% 
  group_by(ethnicity, literacy, response) %>%
  summarize(n = n()) %>%
  arrange(literacy, ethnicity, -response)

# Compute response rates and 83 % confidence intervals by treatment arms
bars <- 
counts %>%
  filter(response) %>%
  select(1:2) %>%
  bind_cols(
    bind_rows(
      (counts %>%
       filter(ethnicity == 'Czech' & literacy == 'High') %>%
       pluck('n') %>%
       binom.test(conf.level = 0.83))[c('estimate', 'conf.int')] %>%
       unlist,
      (counts %>%
       filter(ethnicity == 'Roma' & literacy == 'High') %>%
       pluck('n') %>%
       binom.test(conf.level = 0.83))[c('estimate', 'conf.int')] %>%
       unlist,
      (counts %>%
       filter(ethnicity == 'Czech' & literacy == 'Low') %>%
       pluck('n') %>%
       binom.test(conf.level = 0.83))[c('estimate', 'conf.int')] %>%
       unlist,
      (counts %>%
       filter(ethnicity == 'Roma' & literacy == 'Low') %>%
       pluck('n') %>%
       binom.test(conf.level = 0.83))[c('estimate', 'conf.int')] %>%
       unlist
     )
     )  
names(bars)[3] <- 'estimate'

# Produce and show Figure 1
bars %>%
ggplot(aes(x = ethnicity, y = estimate)) +
    geom_col(fill="black", alpha=0.5) +
    geom_errorbar(aes(x = ethnicity, ymin = conf.int1, ymax = conf.int2), 
                  width = .8, colour = "black", alpha=1, size=1) +
    coord_cartesian(ylim = c(0.2, 0.8)) +
    facet_wrap(~ paste(literacy, 'literacy')) +
    labs(y = 'Response rates and 83% CIs') +
    theme_minimal() +
    theme(text = element_text(size = 16,  family = "Times"),
          axis.text.x = element_text(size = 20,  family = "Times", 
                                   color = 'black'),
          axis.text.y = element_text(size = 18,  family = "Times", 
                                   color = 'black'),
          axis.title.x = element_blank(),
          axis.title.y = element_text(size = 20,  family = "Times", vjust =2),
          strip.text = element_text(size = 20,  family = "Times") 
          )
 
### Summary statistics: Table 1 {{{1

rdata_summary <- rdata %>% 
  # Here we specify variables used in summary stats
  select(literacy, ethnicity, where(is.numeric),where(is.logical)) %>% 
  select(-letter, -recipient.id, -excluded.share) 

# Compute summary statistics for full dataset
all <- rdata_summary %>% 
  select(where(is.numeric),where(is.logical)) %>% 
  add_tally(name = "Observations") %>% 
  summarise(
    across(
      .cols = everything(),
      .fns = list(
        mean = ~ mean(.x, na.rm = TRUE),
        sd = ~ sd(.x, na.rm = TRUE)
      ),
      .names = "{.col}.{.fn}"
    )
  ) %>% 
  pivot_longer(everything(), values_to = "All") %>% 
  separate(name,c("variable","stat"), sep = "\\.")

# Compute summary statistics by treatment arm
bygroup <- rdata_summary %>% 
  group_by(literacy, ethnicity) %>%
  select(where(is.numeric),where(is.logical)) %>% 
  add_tally(name = "Observations") %>% 
  summarise(
    across(
      .cols = everything(),
      .fns = list(
        mean = ~ mean(.x, na.rm = TRUE),
        sd = ~ sd(.x, na.rm = TRUE)
      ),
      .names = "{.col}.{.fn}"
    ),
    .groups = "drop"
  ) %>% 
  pivot_longer(-c(literacy, ethnicity)) %>% 
  pivot_wider(
    names_from = c(literacy,ethnicity)
  ) %>% 
  separate(name,c("variable","stat"), sep = "\\.")

## Compute F-tests
pF <- rdata_summary %>% 
  pivot_longer(-c(literacy,ethnicity)) %>% 
  split(.$name) %>% 
  map_dfr(
    function(x){
      lm(value ~ ethnicity*literacy, data = x) %>% 
        linearHypothesis(c('ethnicityRoma', 'literacyLow',
                           'ethnicityRoma:literacyLow')) %>% 
        .[6] %>% unlist() %>% .[2] %>% 
        tibble(
          pF = .,
          stat = "mean"
        )
    },
    .id = "variable"
  ) %>% 
  mutate(
    pF = ifelse(pF < 0.01,NA,pF),
    pF = format(pF, digits = 1, nsmall = 2, trim = TRUE),
    pF = ifelse(pF == "NA",'p < 0.01',pF)
  ) %>% 
  rename(pFtest = pF)

# Produce and print Table 1
left_join(all,bygroup) %>% 
  filter(!(variable == "Observations" & stat == "sd")) %>% 
  mutate(
    stat = ifelse(variable == "Observations","n",stat)
  ) %>% 
  pivot_longer(-c(stat,variable)) %>% 
  pivot_wider(names_from = "stat") %>% 
  mutate(
    mean = format(mean, digits = 1, scientific = FALSE, nsmall = 2,
                  trim = TRUE),
    sd = format(sd, digits = 1, scientific = FALSE, nsmall = 2, 
                trim = TRUE) %>% 
                  str_c("(",.,")"),
    n = n %>% as.integer() %>% format(trim = TRUE)
  ) %>% 
  pivot_longer(-c(variable,name), names_to = "stat") %>% 
  filter(value != "NA") %>% 
  pivot_wider() %>% 
  filter(!(variable == "Observations" & stat == "sd")) %>% 
  mutate(
    variable = factor(variable, levels = c(
      "response", "female", "prague", "city100k", "city50k", "time_to_reply","distinct_reply", "name_greet", "reply_nwords",
      "marked_spam","forwarded","automat", "Observations"
    ))
  ) %>% 
  arrange(variable,stat) %>% 
  mutate(
    variable = ifelse(stat != "sd", as.character(variable),"")
  ) %>% 
  left_join(.,pF) %>% 
  select(-stat) %>% 
  mutate(across(
    .cols = everything(),
    replace_na, ""
  )) %>%
    as.data.frame

### Main hypotheses: McNemar tests {{{1

## Hypothesis 1 {{{2

# Prepare the 2x2 table
H1.data <- rdata %>% 
  arrange(sent) %>%
  group_by(recipient.id) %>%
  slice(1:2) %>%
  select(response, ethnicity, recipient.id) %>%
  pivot_wider(names_from = ethnicity, values_from = response) %>%
  with(table(Roma, Czech))

# Print the table and the test
H1.data
H1.data %>% sum
H1.data %>% 
  exact2x2(paired = T, midp = T)

## Hypothesis 2 {{{2

# Prepare the data
H2.data <- rdata %>% 
  arrange(sent) %>%
  group_by(recipient.id) %>%
  slice(2:3) %>%
  select(response, literacy, recipient.id) %>%
  pivot_wider(names_from = literacy, values_from = response) %>%
  with(table(High, Low)) 

# Print the table and the test
H2.data
H1.data %>% sum
H2.data %>% 
  exact2x2(paired = T, midp = T)

## Hypothesis 3 {{{2 

# Prepare the 2x2 table
H3.data <- rdata %>% 
  filter(ethnicity == 'Czech') %>%
  group_by(recipient.id) %>%
  filter(n() == 2) %>%
  select(response, literacy, recipient.id) %>%
  pivot_wider(names_from = literacy, values_from = response) %>%
  with(table(High, Low)) 

# Print the table and the test
H3.data
H3.data %>% sum
H3.data %>%
  exact2x2(paired = T, midp = T, alternative = 'less')

## Hypothesis 4 {{{2

# Prepare the 2x2 table
H4.data <- H1.data

# Print the table and the test
H4.data %>% sum
H4.data %>% 
  exact2x2(paired = T, midp = T, alternative = 'greater')

## Hypothesis 5 {{{2

# Prepare the 2x2 table
H5.data <- rdata %>% 
  filter((literacy == 'High' & ethnicity == 'Roma') | 
         (literacy == 'Low' & ethnicity == 'Czech')) %>%
  group_by(recipient.id) %>%
  filter(n() == 2) %>%
  select(response, ethnicity, recipient.id) %>%
  pivot_wider(names_from = ethnicity, values_from = response) %>%
  with(table(Czech, Roma)) 

# Print the table and the test
H5.data
H5.data %>% sum
H5.data %>%
  exact2x2(paired = T, midp = T)

### Main hypotheses: Table 2 {{{1

# Estimate the random effects regressions
dp <- rdata %>%
  pdata.frame(index = 'recipient.id')
lm0 <- plm(response ~ ethnicity, data = dp, model = 'random')
lm1 <- plm(response ~ literacy, data = dp, model = 'random')
lm2 <- plm(response ~ literacy, data = dp, model = 'random', 
           subset = ethnicity == 'Czech')
lm3 <- plm(response ~ literacy, data = dp, model = 'random', 
           subset = ethnicity == 'Roma')
lm4 <- plm(response ~ ethnicity, data = dp, model = 'random', 
           subset = literacy == 'High')
lm5 <- plm(response ~ ethnicity, data = dp, model = 'random', 
           subset = literacy == 'Low')
lm6 <- plm(response ~ ethnicity, data = dp, model = 'random', 
           subset = (literacy == 'High' & ethnicity == 'Roma') |
           (literacy == 'Low' & ethnicity == 'Czech'))
models <- list(lm0, lm1, lm2, lm3, lm4, lm5, lm6)

# Produce and print the table
produce.and.print.table()

## Hausman tests of consistency of random effects estimator {{{2

# Estimate corresponding fixed effects regressions
lm.fe0 <- plm(response ~ ethnicity, data = dp, model = 'within')
lm.fe1 <- plm(response ~ literacy, data = dp, model = 'within')
lm.fe2 <- plm(response ~ literacy, data = dp, model = 'within', 
           subset = ethnicity == 'Czech')
lm.fe3 <- plm(response ~ literacy, data = dp, model = 'within', 
           subset = ethnicity == 'Roma')
lm.fe4 <- plm(response ~ ethnicity, data = dp, model = 'within', 
           subset = literacy == 'High')
lm.fe5 <- plm(response ~ ethnicity, data = dp, model = 'within', 
           subset = literacy == 'Low')
lm.fe6 <- plm(response ~ ethnicity, data = dp, model = 'within', 
           subset = (literacy == 'High' & ethnicity == 'Roma') |
           (literacy == 'Low' & ethnicity == 'Czech'))

# Print the Hausman tests
phtest(lm.fe0, lm0)
phtest(lm.fe1, lm1)
phtest(lm.fe2, lm2)
phtest(lm.fe3, lm3)
phtest(lm.fe4, lm4)
phtest(lm.fe5, lm5)
phtest(lm.fe6, lm6)

### Complementary results {{{1

## Testing for socioeconomic discrimination within the Roma subsample {{{2

# Prepare the 2x2 table
H3roma.data <- rdata %>% 
  filter(ethnicity == 'Roma') %>%
  group_by(recipient.id) %>%
  filter(n() == 2) %>%
  select(response, literacy, recipient.id) %>%
  pivot_wider(names_from = literacy, values_from = response) %>%
  with(table(High, Low)) 

# Print the table and the test
H3roma.data
H3roma.data %>% sum
H3roma.data %>%
  exact2x2(paired = T, midp = T, alternative = 'less')

## Testing for ethnic discrimination, by literacy level {{{2

# High literacy arm {{{3

# Prepare the 2x2 table
H4highlit.data <- rdata %>% 
  filter(literacy == 'High') %>%
  group_by(recipient.id) %>%
  filter(n() == 2) %>%
  select(response, ethnicity, recipient.id) %>%
  pivot_wider(names_from = ethnicity, values_from = response) %>%
  with(table(Czech, Roma)) 

# Print the table and the test
H4highlit.data
H4highlit.data %>% sum
H4highlit.data %>%
  exact2x2(paired = T, midp = T, alternative = 'less')

# Low literacy arm {{{3

# Prepare the 2x2 table
H4lowlit.data <- rdata %>% 
  filter(literacy == 'Low') %>%
  group_by(recipient.id) %>%
  filter(n() == 2) %>%
  select(response, ethnicity, recipient.id) %>%
  pivot_wider(names_from = ethnicity, values_from = response) %>%
  with(table(Czech, Roma)) 

# Print the table and the test
H4lowlit.data
H4lowlit.data %>% sum
H4lowlit.data %>%
  exact2x2(paired = T, midp = T, alternative = 'less')

### Using only between-subject variation: Table 3 {{{1

# Estimate the OLS regressions
dp <- rdata %>%
  arrange(sent) %>%
  group_by(recipient.id) %>%
  slice(1)
lm0 <- lm(response ~ ethnicity, data = dp)
lm1 <- lm(response ~ literacy, data = dp)
lm2 <- lm(response ~ literacy, data = dp,
          subset = ethnicity == 'Czech')
lm3 <- lm(response ~ literacy, data = dp,
          subset = ethnicity == 'Roma')
lm4 <- lm(response ~ ethnicity, data = dp,
          subset = literacy == 'High')
lm5 <- lm(response ~ ethnicity, data = dp,
          subset = literacy == 'Low')
lm6 <- lm(response ~ ethnicity, data = dp,
          subset = (literacy == 'High' & ethnicity == 'Roma') |
           (literacy == 'Low' & ethnicity == 'Czech'))
models <- list(lm0, lm1, lm2, lm3, lm4, lm5, lm6)

# Produce and print the table
stargazer(models,
          se = lapply(models, function(m) {
                        sqrt(diag(vcovHC(m,type="HC3")))
                                   }),
          type = 'text',
          star.cutoffs = c(0.05, 0.01, 0.001),
          keep.stat = c('adj.rsq', 'n'),
          dep.var.caption = '',
          dep.var.labels.include = F,
          column.labels=(c('All', 'All', 'Czechs', 'Roma', 'High',
                           'Low', 'HR LC'))
          )

# Low literacy: Table 2 vs Table 3

pnorm(1 - (0.059 - (-0.044)) / sqrt(0.068^2 + 0.030^2))

### Exploring heterogeneity in discrimination: Table 4 {{{1

## A: Dropping male unemployment specialists {{{2

# Estimate the random effects regressions
dp <- rdata %>%
  filter(female) %>% 
  pdata.frame(index = 'recipient.id')
lm0 <- plm(response ~ ethnicity, data = dp, model = 'random')
lm1 <- plm(response ~ literacy, data = dp, model = 'random')
lm2 <- plm(response ~ literacy, data = dp, model = 'random', 
           subset = ethnicity == 'Czech')
lm3 <- plm(response ~ literacy, data = dp, model = 'random', 
           subset = ethnicity == 'Roma')
lm4 <- plm(response ~ ethnicity, data = dp, model = 'random', 
           subset = literacy == 'High')
lm5 <- plm(response ~ ethnicity, data = dp, model = 'random', 
           subset = literacy == 'Low')
lm6 <- plm(response ~ ethnicity, data = dp, model = 'random', 
           subset = (literacy == 'High' & ethnicity == 'Roma') |
           (literacy == 'Low' & ethnicity == 'Czech'))
models <- list(lm0, lm1, lm2, lm3, lm4, lm5, lm6)

# Produce and print the table
produce.and.print.table()

## B: Dropping unemployment specialists located in Prague {{{2

# Estimate the random effects regressions
dp <- rdata %>%
  filter(!prague) %>% 
  pdata.frame(index = 'recipient.id')
lm0 <- plm(response ~ ethnicity, data = dp, model = 'random')
lm1 <- plm(response ~ literacy, data = dp, model = 'random')
lm2 <- plm(response ~ literacy, data = dp, model = 'random', 
           subset = ethnicity == 'Czech')
lm3 <- plm(response ~ literacy, data = dp, model = 'random', 
           subset = ethnicity == 'Roma')
lm4 <- plm(response ~ ethnicity, data = dp, model = 'random', 
           subset = literacy == 'High')
lm5 <- plm(response ~ ethnicity, data = dp, model = 'random', 
           subset = literacy == 'Low')
lm6 <- plm(response ~ ethnicity, data = dp, model = 'random', 
           subset = (literacy == 'High' & ethnicity == 'Roma') |
           (literacy == 'Low' & ethnicity == 'Czech'))
models <- list(lm0, lm1, lm2, lm3, lm4, lm5, lm6)

# Produce and print the table
produce.and.print.table()

## C: Dropping unemployment specialists in cities larger than 100k {{{2

# Estimate the random effects regressions
dp <- rdata %>%
  filter(!prague & !city100k) %>% 
  pdata.frame(index = 'recipient.id')
lm0 <- plm(response ~ ethnicity, data = dp, model = 'random')
lm1 <- plm(response ~ literacy, data = dp, model = 'random')
lm2 <- plm(response ~ literacy, data = dp, model = 'random', 
           subset = ethnicity == 'Czech')
lm3 <- plm(response ~ literacy, data = dp, model = 'random', 
           subset = ethnicity == 'Roma')
lm4 <- plm(response ~ ethnicity, data = dp, model = 'random', 
           subset = literacy == 'High')
lm5 <- plm(response ~ ethnicity, data = dp, model = 'random', 
           subset = literacy == 'Low')
lm6 <- plm(response ~ ethnicity, data = dp, model = 'random', 
           subset = (literacy == 'High' & ethnicity == 'Roma') |
           (literacy == 'Low' & ethnicity == 'Czech'))
models <- list(lm0, lm1, lm2, lm3, lm4, lm5, lm6)

# Produce and print the table
produce.and.print.table()

## (Not reported): Dropping unempl. specialists in cities larger than 50k {{{2

# Estimate the random effects regressions
dp <- rdata %>%
  filter(!prague & !city100k & !city50k) %>% 
  pdata.frame(index = 'recipient.id')
lm0 <- plm(response ~ ethnicity, data = dp, model = 'random')
lm1 <- plm(response ~ literacy, data = dp, model = 'random')
lm2 <- plm(response ~ literacy, data = dp, model = 'random', 
           subset = ethnicity == 'Czech')
lm3 <- plm(response ~ literacy, data = dp, model = 'random', 
           subset = ethnicity == 'Roma')
lm4 <- plm(response ~ ethnicity, data = dp, model = 'random', 
           subset = literacy == 'High')
lm5 <- plm(response ~ ethnicity, data = dp, model = 'random', 
           subset = literacy == 'Low')
lm6 <- plm(response ~ ethnicity, data = dp, model = 'random', 
           subset = (literacy == 'High' & ethnicity == 'Roma') |
           (literacy == 'Low' & ethnicity == 'Czech'))
models <- list(lm0, lm1, lm2, lm3, lm4, lm5, lm6)

# Produce and print the table
produce.and.print.table()

## Controlling to for exposure to socially excluded locations {{{2

# Estimate the random effects regressions
dp <- rdata %>%
  filter(!is.na(excluded.share)) %>% 
  pdata.frame(index = 'recipient.id')
lm0 <- plm(response ~ ethnicity * excluded.share, data = dp, model = 'random')
lm1 <- plm(response ~ literacy * excluded.share, data = dp, model = 'random')
lm2 <- plm(response ~ literacy * excluded.share, data = dp, model = 'random', 
           subset = ethnicity == 'Czech')
lm3 <- plm(response ~ literacy * excluded.share, data = dp, model = 'random', 
           subset = ethnicity == 'Roma')
lm4 <- plm(response ~ ethnicity * excluded.share, data = dp, model = 'random', 
           subset = literacy == 'High')
lm5 <- plm(response ~ ethnicity * excluded.share, data = dp, model = 'random', 
           subset = literacy == 'Low')
lm6 <- plm(response ~ ethnicity * excluded.share, data = dp, model = 'random', 
           subset = (literacy == 'High' & ethnicity == 'Roma') |
           (literacy == 'Low' & ethnicity == 'Czech'))
models <- list(lm0, lm1, lm2, lm3, lm4, lm5, lm6)

# Produce and print the table
produce.and.print.table()

# (Not reported) Check main results in the subsample with excluded.share {{{3

lm0 <- plm(response ~ ethnicity, data = dp, model = 'random')
lm1 <- plm(response ~ literacy, data = dp, model = 'random')
lm2 <- plm(response ~ literacy, data = dp, model = 'random', 
           subset = ethnicity == 'Czech')
lm3 <- plm(response ~ literacy, data = dp, model = 'random', 
           subset = ethnicity == 'Roma')
lm4 <- plm(response ~ ethnicity, data = dp, model = 'random', 
           subset = literacy == 'High')
lm5 <- plm(response ~ ethnicity, data = dp, model = 'random', 
           subset = literacy == 'Low')
lm6 <- plm(response ~ ethnicity, data = dp, model = 'random', 
           subset = (literacy == 'High' & ethnicity == 'Roma') |
           (literacy == 'Low' & ethnicity == 'Czech'))
models <- list(lm0, lm1, lm2, lm3, lm4, lm5, lm6)

# Produce and print the table
produce.and.print.table()
